Analysis of Employee Retention by DDSAnalytics

A video presentation of this data can be found at: https://www.youtube.com/watch?v=s4M4MJ1SODw&feature=youtu.be

In this report, we are trying to predict employee retention based on a number of features.

# load in the libraries
library(class)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(e1071)
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(corrplot)
## corrplot 0.84 loaded
library(corrgram)
## Registered S3 method overwritten by 'seriation':
##   method         from 
##   reorder.hclust gclus
## 
## Attaching package: 'corrgram'
## The following object is masked from 'package:lattice':
## 
##     panel.fill
library(naniar)
library(gridExtra)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble  2.1.3     ✓ dplyr   0.8.3
## ✓ tidyr   1.0.0     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0
## ✓ purrr   0.3.3
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::combine() masks gridExtra::combine()
## x dplyr::filter()  masks stats::filter()
## x dplyr::lag()     masks stats::lag()
## x purrr::lift()    masks caret::lift()
attr = read_csv(
  "./data/CaseStudy2-data.csv",
  col_types = cols(
    Attrition = col_character(),
    BusinessTravel = col_factor(),
    Department = col_factor(),
    EducationField = col_factor(),
    Gender = col_factor(),
    JobRole = col_factor(),
    MaritalStatus = col_factor(),
    OverTime = col_factor(),
    Age = col_integer(),
    DailyRate = col_integer(),
    DistanceFromHome = col_integer(),
    EmployeeNumber = col_integer(),
    HourlyRate = col_integer(),
    MonthlyIncome = col_integer(),
    MonthlyRate = col_integer(),
    NumCompaniesWorked = col_integer(),
    PercentSalaryHike = col_integer(),
    TotalWorkingYears = col_integer(),
    TrainingTimesLastYear = col_integer(),
    YearsAtCompany = col_integer(),
    YearsInCurrentRole = col_integer(),
    YearsSinceLastPromotion = col_integer(),
    YearsWithCurrManager = col_integer(),
    Education = col_integer(),
    EnvironmentSatisfaction = col_integer(),
    JobInvolvement = col_integer(),
    JobLevel = col_integer(),
    JobSatisfaction = col_integer(),
    PerformanceRating = col_integer(),
    RelationshipSatisfaction = col_integer(),
    StockOptionLevel = col_integer(),
    WorkLifeBalance = col_integer()
  )
)
attr = attr %>% select(-one_of( c("ID", "EmployeeCount", "Over18", "StandardHours")))

We see that there are no missing values.

gg_miss_var(attr)

continuous_features = attr %>% select(
  Age,
  DailyRate,
  DistanceFromHome,
  EmployeeNumber,
  HourlyRate,
  MonthlyIncome,
  MonthlyRate,
  NumCompaniesWorked,
  PercentSalaryHike,
  TotalWorkingYears,
  TrainingTimesLastYear,
  YearsAtCompany,
  YearsInCurrentRole,
  YearsSinceLastPromotion,
  YearsWithCurrManager,
  Education,
  EnvironmentSatisfaction,
  JobInvolvement,
  JobLevel,
  JobSatisfaction,
  PerformanceRating,
  RelationshipSatisfaction,
  StockOptionLevel,
  WorkLifeBalance
)

factors = attr %>% select(
  BusinessTravel,
  Department,
  EducationField,
  Gender,
  JobRole,
  MaritalStatus,
  OverTime
)

First we look at the collinearity of the continuous variables and notice that manu of the year realted vairables are correlated

corrgram(continuous_features)

uncorrelated_features = c(
  'DailyRate', 
  'DistanceFromHome', 
  'EmployeeNumber', 
  'HourlyRate', 
  'MonthlyRate', 
  'TrainingTimesLastYear', 
  'EnvironmentSatisfaction', 
  'JobInvolvement', 
  'JobSatisfaction', 
  'RelationshipSatisfaction', 
  'StockOptionLevel', 
  'WorkLifeBalance'
)
feat1  = attr %>% select(Attrition, YearsAtCompany, YearsInCurrentRole, YearsSinceLastPromotion, YearsWithCurrManager)
ggpairs(feat1, mapping=ggplot2::aes(color = Attrition), progress = FALSE)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Breaking down the years features, we do see there is some realation between these features and attrition. This relationship is easier to see on a log scale.

attr$LogYearsAtCompany = log(attr$YearsAtCompany + 1)
attr$LogYearsInCurrentRole = log(attr$YearsInCurrentRole + 1)
attr$LogYearsSinceLastPromotion = log(attr$YearsSinceLastPromotion + 1)
attr$LogYearsWithCurrManager = log(attr$YearsWithCurrManager + 1)
feat2 = attr %>% select(Attrition, LogYearsAtCompany, LogYearsInCurrentRole, LogYearsSinceLastPromotion, LogYearsWithCurrManager)
ggpairs(feat2, mapping=ggplot2::aes(color = Attrition), progress = FALSE)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

attr$YearsBeforeCompany = attr$TotalWorkingYears - attr$YearsAtCompany
attr$LogTotalWorkingYears = log(attr$TotalWorkingYears + 1)
feat3  = attr %>% select(Attrition, Age, MonthlyIncome, YearsBeforeCompany, JobLevel)
ggpairs(feat3, mapping=ggplot2::aes(color = Attrition), progress = FALSE)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Age and income also look quite important when looking at attrition

attr$LogMonthlyIncome = log(attr$MonthlyIncome + 1)
attr$LogYearsBeforeCompany= log(attr$YearsBeforeCompany + 1)
feat3  = attr %>% select(Attrition, Age, LogMonthlyIncome, LogYearsBeforeCompany, JobLevel)
ggpairs(feat3, mapping=ggplot2::aes(color = Attrition), progress = FALSE)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

feat4  = attr %>% select(Attrition, PercentSalaryHike, PerformanceRating, NumCompaniesWorked, Education, YearsBeforeCompany)
ggpairs(feat4, mapping=ggplot2::aes(color = Attrition), progress = FALSE)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

attr$LogNumCompaniesWorked = log(attr$NumCompaniesWorked + 1)
attr$LogYearsBeforeCompany = log(attr$YearsBeforeCompany + 1)
feat5  = attr %>% select(Attrition, PercentSalaryHike, PerformanceRating, LogNumCompaniesWorked, Education, LogYearsBeforeCompany)
ggpairs(feat5, mapping=ggplot2::aes(color = Attrition), progress = FALSE)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Education, Number of Companies, and Years before company also look like they could add some information.

attr$LogDistanceFromHome = log(attr$DistanceFromHome + 1)
feat6  = attr %>% select(Attrition, DailyRate, HourlyRate, MonthlyRate, LogDistanceFromHome)
ggpairs(feat6, mapping=ggplot2::aes(color = Attrition), progress = FALSE)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

feat6  = attr %>% select(Attrition, EmployeeNumber, TrainingTimesLastYear, StockOptionLevel, WorkLifeBalance)
ggpairs(feat6, mapping=ggplot2::aes(color = Attrition), progress = FALSE)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Stock level and work/life balalance also look like they could add some influnece.

feat7  = attr %>% select(Attrition, EnvironmentSatisfaction, JobInvolvement, RelationshipSatisfaction, JobSatisfaction)
ggpairs(feat7, mapping=ggplot2::aes(color = Attrition), progress = FALSE)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

We can see if taking a single measure accross satiscaction field ties the inforation together

attr$pressurePoint = pmin(
  attr$JobSatisfaction, 
  attr$EnvironmentSatisfaction, 
  attr$RelationshipSatisfaction
)
attr$TotalSatisfaction = pmax(
  attr$JobSatisfaction, 
  attr$EnvironmentSatisfaction, 
  attr$RelationshipSatisfaction
)
attr$pressurePointFull = pmin(
  attr$JobSatisfaction, 
  attr$EnvironmentSatisfaction, 
  attr$RelationshipSatisfaction, 
  attr$JobInvolvement, 
  attr$WorkLifeBalance
)
attr$TotalSatisfactionFull = pmax(
  attr$JobSatisfaction, 
  attr$EnvironmentSatisfaction, 
  attr$RelationshipSatisfaction, 
  attr$JobInvolvement, 
  attr$WorkLifeBalance
)
feat7  = attr %>% select(Attrition, pressurePoint, TotalSatisfaction, pressurePointFull, TotalSatisfactionFull)
ggpairs(feat7, mapping=ggplot2::aes(color = Attrition), progress = FALSE)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

cat_feats = attr %>% select(    
    Attrition,
    BusinessTravel,
    Department,
    EducationField,
    Gender,
    JobRole,
    MaritalStatus,
    OverTime
)
ggplot(data = cat_feats, mapping = aes(x=JobRole, fill = Attrition)) + geom_bar(position="fill") + theme(axis.text.x = element_text(angle = 45, hjust = 1))

tr = ggplot(data = cat_feats, mapping = aes(x=BusinessTravel, fill = Attrition)) + geom_bar(position="fill")
dep = ggplot(data = cat_feats, mapping = aes(x=Department, fill = Attrition)) + geom_bar(position="fill")
ge = ggplot(data = cat_feats, mapping = aes(x=Gender, fill = Attrition)) + geom_bar(position="fill")
ma = ggplot(data = cat_feats, mapping = aes(x=MaritalStatus, fill = Attrition)) + geom_bar(position="fill")
ov = ggplot(data = cat_feats, mapping = aes(x=OverTime, fill = Attrition)) + geom_bar(position="fill")
grid.arrange(tr, dep, ge, ma, ov)

ggplot(data = cat_feats, mapping = aes(x=EducationField, fill = Attrition)) + geom_bar(position="fill") + theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot(data = attr, mapping = aes(x=JobRole, y=LogMonthlyIncome)) + geom_boxplot() + theme(axis.text.x = element_text(angle = 45, hjust = 1))

It is interesting to note that the job levels with the highest salary also have the lowest job satisfcation.

ggplot(data = attr, mapping = aes(x=JobRole, y= JobSatisfaction )) + geom_boxplot() + theme(axis.text.x = element_text(angle = 45, hjust = 1))

Stock option has more of a bathtub shaped attrition curve.

ggplot(data = attr, mapping = aes(x=StockOptionLevel, fill = Attrition)) + geom_bar(position="fill") + theme(axis.text.x = element_text(angle = 45, hjust = 1))

Drop the columns that are are not useful or those that have been replaced by their log version

drop_cols = c(
  'YearsAtCompany',
  'YearsInCurrentRole',
  'LogYearsSinceLastPromotion',
  'YearsWithCurrManager',
  'YearsBeforeCompany',
  'TotalWorkingYears',
  'NumCompaniesWorked',
  'MonthlyIncome',
  'DistanceFromHome',
  'Gender',
  'EmployeeNumber'
)
attr = attr %>% select(-one_of(drop_cols))
attr$Attrition <- attr$Attrition %>% recode('Yes'=1,'No'= 0)
attr$BusinessTravel <- attr$BusinessTravel %>% recode('Travel_Frequently'=2,'Travel_Rarely'=1,'Non-Travel'= 0)
attr$Department <- attr$Department %>% recode('Sales'=2,'Human Resources'=1,'Research & Development'= 0)
attr$EducationField <- attr$EducationField %>% recode('Medical'=0,'Life Sciences'=1,'Other'=2, 'Marketing'=3, 'Technical Degree'=4, 'Human Resources'=5)
attr$OverTime <- attr$OverTime %>% recode('Yes'=1,'No'= 0)
attr$MaritalStatus <- attr$MaritalStatus %>% recode('Divorced'=2,'Married'=1,'Single'= 0)
attr$JobRole <- attr$JobRole %>% recode(
  'Sales Representative'=0, 
  'Human Resources'=1, 
  'Laboratory Technician'=2, 
  'Sales Executive'=3, 
  'Research Scientist'=4, 
  'Healthcare Representative'=5, 
  'Manager'=6,
  'Research Director'=7,
  'Manufacturing Director'=7
)

Split into train and test sets to evalute the performance of the KNN model

splitPerc = .70
trainIndices = sample(1:dim(attr)[1],round(splitPerc * dim(attr)[1]))
train = attr[trainIndices,]
test = attr[-trainIndices,]
train_labels = train$Attrition
test_labels = test$Attrition

test = test %>% select(-Attrition)
train = train %>% select(-Attrition)

accs = data.frame(sens = numeric(30), spec = numeric(30), k = numeric(30))

for(i in 1:30)
{
  classifications = knn(train, test, train_labels, prob = TRUE, k = i)
  table(test_labels, classifications)
  CM = confusionMatrix(table(test_labels, classifications))
  accs$sens[i] = CM$byClass['Sensitivity']
  accs$spec[i] = CM$byClass['Specificity']
  accs$k[i] = i
}

accs[is.na(accs)] = 0
print(max(accs$spec))
## [1] 0.5

Taking a subset of the features produces a mch better model.

set.seed(742)

exp_features = attr %>% select(
  Attrition,
  JobSatisfaction,
  MaritalStatus,
  JobRole,
  StockOptionLevel,
  LogMonthlyIncome,
  LogYearsInCurrentRole,
)

splitPerc = .70
trainIndices = sample(1:dim(exp_features)[1],round(splitPerc * dim(exp_features)[1]))
train = exp_features[trainIndices,]
test = exp_features[-trainIndices,]
train_labels = train$Attrition
test_labels = test$Attrition

test = test %>% select(-Attrition)
train = train %>% select(-Attrition)

accs = data.frame(sens = numeric(40), spec = numeric(40), k = numeric(40))

for(i in 1:40)
{
  classifications = knn(train, test, train_labels, prob = TRUE, k = i)
  table(test_labels, classifications)
  CM = confusionMatrix(table(test_labels, classifications))
  accs$sens[i] = CM$byClass['Sensitivity']
  accs$spec[i] = CM$byClass['Specificity']
  accs$k[i] = i
}

print(accs)
##         sens      spec  k
## 1  0.8711111 0.3055556  1
## 2  0.8767123 0.3095238  2
## 3  0.8765432 0.5555556  3
## 4  0.8663968 0.5000000  4
## 5  0.8650794 0.6666667  5
## 6  0.8565737 0.4000000  6
## 7  0.8611111 0.5555556  7
## 8  0.8645418 0.6000000  8
## 9  0.8616601 0.6250000  9
## 10 0.8656126 0.7500000 10
## 11 0.8622047 0.7142857 11
## 12 0.8622047 0.7142857 12
## 13 0.8622047 0.7142857 13
## 14 0.8549020 0.5000000 14
## 15 0.8588235 0.6666667 15
## 16 0.8554688 0.6000000 16
## 17 0.8554688 0.6000000 17
## 18 0.8549020 0.5000000 18
## 19 0.8549020 0.5000000 19
## 20 0.8543307 0.4285714 20
## 21 0.8554688 0.6000000 21
## 22 0.8549020 0.5000000 22
## 23 0.8549020 0.5000000 23
## 24 0.8554688 0.6000000 24
## 25 0.8560311 0.7500000 25
## 26 0.8554688 0.6000000 26
## 27 0.8560311 0.7500000 27
## 28 0.8560311 0.7500000 28
## 29 0.8560311 0.7500000 29
## 30 0.8527132 0.6666667 30
## 31 0.8560311 0.7500000 31
## 32 0.8521401 0.5000000 32
## 33 0.8527132 0.6666667 33
## 34 0.8527132 0.6666667 34
## 35 0.8527132 0.6666667 35
## 36 0.8527132 0.6666667 36
## 37 0.8527132 0.6666667 37
## 38 0.8532819 1.0000000 38
## 39 0.8532819 1.0000000 39
## 40 0.8527132 0.6666667 40
comp_noattr = read_csv("./data/CaseStudy2CompSet No Attrition.csv",
  col_types = cols(
    JobRole = col_factor(),
    MaritalStatus = col_factor(),
    MonthlyIncome = col_integer(),
    YearsInCurrentRole = col_integer(),
    JobSatisfaction = col_integer(),
    StockOptionLevel = col_integer()
  )
)
comp_noattr$LogYearsInCurrentRole = log(comp_noattr$YearsInCurrentRole + 1)
comp_noattr$LogMonthlyIncome = log(comp_noattr$MonthlyIncome + 1)

comp_noattr$BusinessTravel <- comp_noattr$BusinessTravel %>% recode('Travel_Frequently'=2,'Travel_Rarely'=1,'Non-Travel'= 0)
comp_noattr$Department <- comp_noattr$Department %>% recode('Sales'=2,'Human Resources'=1,'Research & Development'= 0)
comp_noattr$EducationField <- comp_noattr$EducationField %>% recode('Medical'=0,'Life Sciences'=1,'Other'=2, 'Marketing'=3, 'Technical Degree'=4, 'Human Resources'=5)
comp_noattr$OverTime <- comp_noattr$OverTime %>% recode('Yes'=1,'No'= 0)
comp_noattr$MaritalStatus <- comp_noattr$MaritalStatus %>% recode('Divorced'=2,'Married'=1,'Single'= 0)
comp_noattr$JobRole <- comp_noattr$JobRole %>% recode(
  'Sales Representative'=0, 
  'Human Resources'=1, 
  'Laboratory Technician'=2, 
  'Sales Executive'=3, 
  'Research Scientist'=4, 
  'Healthcare Representative'=5, 
  'Manager'=6,
  'Research Director'=7,
  'Manufacturing Director'=7
)

comp_noattr_model = comp_noattr %>% select(
  JobSatisfaction,
  MaritalStatus,
  JobRole,
  StockOptionLevel,
  LogMonthlyIncome,
  LogYearsInCurrentRole,
)

comp_classifications = knn(train, comp_noattr_model, train_labels, k = 10)
comp_noattr$Attribution =  comp_classifications
comp_noattr = comp_noattr %>% select(ID, Attribution)
write_csv(comp_noattr, 'Case2PredictionsDhillon Attrition.csv')
income_feats = attr %>% mutate(EducationField = factor(EducationField), JobRole = factor(JobRole))
income_feats = income_feats %>% select(Age, JobRole, EducationField, LogTotalWorkingYears, LogMonthlyIncome)

splitPerc = .70
trainIndices = sample(1:dim(exp_features)[1],round(splitPerc * dim(exp_features)[1]))
train = income_feats[trainIndices,]
test = income_feats[-trainIndices,]

income_model <- lm(LogMonthlyIncome ~ Age + JobRole + LogTotalWorkingYears, data=train)
summary(income_model)
## 
## Call:
## lm(formula = LogMonthlyIncome ~ Age + JobRole + LogTotalWorkingYears, 
##     data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.7945 -0.2138  0.0181  0.2068  0.8461 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           7.3461889  0.0636052 115.497   <2e-16 ***
## Age                  -0.0005182  0.0017342  -0.299    0.765    
## JobRole1              0.0464489  0.0943680   0.492    0.623    
## JobRole2             -0.0714951  0.0558274  -1.281    0.201    
## JobRole3              0.5598298  0.0571986   9.787   <2e-16 ***
## JobRole4             -0.0682678  0.0552925  -1.235    0.217    
## JobRole5              0.5975436  0.0652909   9.152   <2e-16 ***
## JobRole6              1.2268091  0.0779754  15.733   <2e-16 ***
## JobRole7              0.8238575  0.0621791  13.250   <2e-16 ***
## LogTotalWorkingYears  0.3761713  0.0259274  14.509   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2951 on 599 degrees of freedom
## Multiple R-squared:  0.8072, Adjusted R-squared:  0.8043 
## F-statistic: 278.7 on 9 and 599 DF,  p-value: < 2.2e-16
comp_noIncome = read_csv("./data/CaseStudy2CompSet No Salary.csv",
  col_types = cols(
    JobRole = col_factor(),
    TotalWorkingYears = col_integer(),
    Age = col_integer()
  )
)
comp_noIncome$JobRole <- comp_noIncome$JobRole %>% recode(
  'Sales Representative'=0, 
  'Human Resources'=1, 
  'Laboratory Technician'=2, 
  'Sales Executive'=3, 
  'Research Scientist'=4, 
  'Healthcare Representative'=5, 
  'Manager'=6,
  'Research Director'=7,
  'Manufacturing Director'=7
)
comp_noIncome = comp_noIncome %>% mutate(JobRole = factor(JobRole))

comp_noIncome$LogTotalWorkingYears = log(comp_noIncome$TotalWorkingYears + 1)
comp_noIncome_model = comp_noIncome %>% select(Age, JobRole, LogTotalWorkingYears)
pred_salary = predict(income_model, comp_noIncome_model)
comp_noIncome$LogMonthlyIncome = pred_salary
comp_noIncome$MonthlyIncome = exp(comp_noIncome$LogMonthlyIncome) - 1
comp_noIncome = comp_noIncome %>% select(ID, MonthlyIncome)
write_csv(comp_noIncome, 'Case2PredictionsDhillon Salary.csv')